/* 
    Purpose: Using the 1960 Census (5% sample), this file locates black and white men aged 
             30-50 who are fathers of a child younger than 18 in the same household. Other 
             variables necessary to create average predicted father income (in 3b) are also 
             cleaned.

    Note: Income (inctot) was asked of all individuals 14+ in the 1960 Census, 
          so there's no need to use a sample line weight (or any weight). 
          The same is true for the variable incwage.

    Creates: Census1960_fathers_ages30to50.dta
*/

* Session setup: disable output paging, start from an empty dataset, and
* work relative to the Census data folder under the global $Mydirectory1.
set more off
clear
cd "$Mydirectory1/1_DataSources/CensusData/"

* Load the raw 1960 5% extract (download from IPUMS USA)
use "./input/Census1960_5pct_raw.dta", clear
    tabulate perwt //Confirmed: everyone receives a weight of 1.
            
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** SET UP INCOME VARIABLES
***************************

* Clean the income variables. For both personal income (inctot) and the
* Census-provided family income (ftotinc, "v1"), the code 9999999 is recoded
* to missing and negative amounts are floored at zero.
    foreach v of varlist inctot ftotinc {
        replace `v' = . if `v' == 9999999
        replace `v' = 0 if `v' < 0
    }

* Family income (v2--built by hand as the sum of inctot over the members of a family unit)
    tempvar famsum ishead headinc
    bysort serial famunit: egen `famsum' = total(inctot)

* Count discrepancies between v1 and v2
    /*Note: Discrepancies appear to come
            mostly from individuals living
            in group quarters. */
    count if ftotinc == . & `famsum' > 0

* Harmonize v1 and v2: fill in v1 from v2 where v1 is missing and v2 is positive
    replace ftotinc = `famsum' if ftotinc == . & `famsum' > 0

* Household income

    //Flag one member per family unit (lowest pernum) so each family's income is counted once
    bysort serial famunit (pernum): gen `ishead' = (_n == 1)

    //That member carries the family income; everyone else (and any missing value) contributes 0
    gen `headinc' = cond(`ishead' == 1 & ftotinc != ., ftotinc, 0)

    //Add up incomes of "separate" families within a serial to get household income (i.e. income by serial number)
    bysort serial: egen hh_income = total(`headinc')
    drop `famsum' `ishead' `headinc'

    //From here on the harmonized Census family income is called fam_income
    rename ftotinc fam_income
            
/*
   Deflate the three income measures from 1960 dollars to 1950 dollars
   by multiplying by the ratio CPI(1950) / CPI(1960).
   CPI source: https://www.minneapolisfed.org/about-us/monetary-policy/inflation-calculator/consumer-price-index-1913-
*/
    generate CPI1960 = 29.6
    generate CPI1950 = 24.1

    local income_vars inctot fam_income hh_income
    foreach v of local income_vars {
        replace `v' = (CPI1950 / CPI1960) * `v'
    }

* Save the complete person file BEFORE applying any income restriction.
    /* Why: the list of children is built from this file. Income (inctot) was
       only asked of individuals 14+, so every child under 14 has missing
       inctot (and 14-17 year olds often have zero). If the income restriction
       were applied first, those children would be dropped and their fathers
       would never be identified (and kidsperdad would be undercounted). */
    tempfile fulldata
    save `fulldata'

*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** IDENTIFY FATHERS
***************************

    keep if age<18 //Restrict to children younger than 18
    keep serial poploc

    drop if poploc==0 | poploc==. //Exclude children without a father in the house

* Count number of kids per father (one row per child, so the group size is the count)
    bysort serial poploc: gen kidsperdad = _N
    label var kidsperdad "Number of kids living in Census father's household"

    bysort serial poploc: keep if _n==1 //Keep all unique father ids. Some fathers will have multiple children in the Census.
    rename poploc pernum //poploc is matched against the father's own pernum in the merge below

    tempfile children
    save `children'

* Keep the sample of fathers
    use `fulldata', clear

    //Keep respondents with non-zero and non-missing income.
    //Applied here (to the prospective fathers) rather than before the child list was built.
    foreach var of varlist inctot fam_income hh_income {
        drop if `var'==0 | `var'==.
    }

    merge 1:1 serial pernum using `children'

    keep if _merge==3 //Only persons who are the father of at least one child <18
    drop _merge

* Household-size adjusted weight
    gen wgt1960_hhsizeadj = perwt*kidsperdad
    label var wgt1960_hhsizeadj "Alternative Census weight; adjusted for # kids in father's household"
                        
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

***************************
** CREATE OTHER NECESSARY VARIABLES
***************************

* Restrict the sample to fathers aged 30-50 whose race code is 1 or 2 (black and white fathers)
    keep if inrange(age, 30, 50)
    keep if inlist(race, 1, 2)

* Four-category region of current residence, collapsed from the Census region codes
    * Northeast: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont, New Jersey, New York, Pennsylvania
    * Midwest: Illinois, Indiana, Michigan, Ohio, Wisconsin, Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, South Dakota
    /* South: Delaware, District of Columbia, Florida, Georgia, Maryland, North Carolina, South Carolina, Virginia, West Virginia, Alabama,
              Kentucky, Mississippi, Tennessee, Arkansas, Louisiana, Oklahoma, Texas */
    * West: Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, Wyoming, California, Oregon, Washington --note: Census puts AK and HI in with Pacific division

    generate region_merge = .
    replace region_merge = 1 if inlist(region, 11, 12) //Northeast
    replace region_merge = 2 if inlist(region, 21, 22) //Midwest
    replace region_merge = 3 if inrange(region, 31, 33) //South
    replace region_merge = 4 if inlist(region, 41, 42) //West

    //Check the mapping: raw codes, then the collapsed codes (any unmapped code shows up as missing)
    tabulate region, missing
    tabulate region_merge, missing

    label define region_l 1 "NORTHEAST" 2 "MIDWEST" 3 "SOUTH" 4 "WEST"
    label values region_merge region_l
    tabulate region_merge, missing

    //Indicator for living in the South (0 for every other value of region_merge)
    generate south_merge = (region_merge == 3)
    
* Education variable: collapse detailed attainment (educd) into five categories
    tab educd, m

    gen edu=.
    replace edu=1 if educd<=25 //<grade school (includes people with no schooling)
    replace edu=2 if educd==26 //8th grade
    replace edu=3 if inlist(educd,30,40,50,61) //<hs
    replace edu=4 if inlist(educd,60,62,63,64) //hs
    //>hs. "999" is the missing code of educd, so the guard must test educd itself;
    //testing the general variable educ would always pass and code missing as >hs.
    replace edu=5 if educd>64 & educd<999
    tab educd edu, m //Check the mapping; educd==999 should be left with edu missing
        
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** ASSIGN COARSENED OCCS
****************

* Count # of distinct Census occupation codes in the 1960 data
    //nvals flags one observation per occ1950 value; bysort also leaves the data sorted by occ1950
    bysort occ1950: gen nvals = (_n == 1)
    count if nvals == 1

* Separate people with occupations in 200's based on self-employment
    //Codes 200-290 with classwkr==1 are shifted up by 1000 so they get their own crosswalk rows
    replace occ1950 = occ1950 + 1000 if inrange(occ1950, 200, 290) & classwkr == 1

* Crosswalk Census occupations to coarsened ANES occupations
    merge m:1 occ1950 using ../Crosswalks/Crosswalk_1950Census_toANES.dta
    assert _merge != 1 //Every occupation code in the data must appear in the crosswalk
    keep if _merge == 3 //Discard crosswalk rows that match nobody in the data
    drop _merge

    //The only code allowed to have no coarsened occupation is 997
    assert occ1950 == 997 if occ1950ej == . //997 = occupation missing or unknown
    keep if occ1950ej != .
    tab occ1950 if occ1950ej == 99, m nol
            
*------------------------------------------------------------------------------------*
*------------------------------------------------------------------------------------*

****************
**** SAVE 
****************

* Final variable selection
    rename occ1950ej fatheroccej

    keep age race edu region_merge south_merge fatheroccej inctot fam_income hh_income perwt wgt*

* Marker identifying these observations as coming from the Census
    generate census = 1
    label variable census "Census obs"

* Log of each income measure, plus labels on both the level and the log
    local income_vars inctot fam_income hh_income
    foreach v of local income_vars {
        generate log_father_`v' = ln(`v')
        label variable log_father_`v' "Log `v', Census"
        label variable `v' "`v', Census"
    }

* Shorter names for the family and household income levels
    //The log variables created above keep the original names
    rename fam_income faminc
    rename hh_income HHinc

* Shrink storage types and write the output file
    compress
    save "./output/Census1960_fathers_ages30to50.dta", replace
                    
    
